package cn.datawin.spider.util;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Static helpers for slicing, classifying and resolving URL strings.
 *
 * <p>All methods are stateless. Unless a method says otherwise, passing
 * {@code null} results in a {@link NullPointerException}.
 */
public class UrlUtil {

	/** Matches a leading scheme such as {@code http://} or {@code ftp://}. */
	private static final Pattern PROTOCOL_PATTERN = Pattern.compile("^\\w+://");

	private static final String HTTP_PREFIX = "http://";

	private static final String HTTPS_PREFIX = "https://";

	/**
	 * Returns everything before the third {@code '/'} of {@code url}; for
	 * {@code http://host/path} that is {@code http://host}.
	 *
	 * <p>The method only counts slashes, so an input without a scheme is cut at
	 * its third slash as well.
	 *
	 * @param url the URL string, may be {@code null}
	 * @return the prefix before the third slash, the unchanged input when it has
	 *         fewer than three slashes, or {@code null} for {@code null} input
	 */
	public static String getHost(String url) {
		if (url == null) {
			return null;
		}
		int thirdSlash = ordinalIndexOf(url, '/', 3);
		return thirdSlash > 0 ? url.substring(0, thirdSlash) : url;
	}

	/**
	 * Strips a leading {@code scheme://} prefix from {@code url}.
	 *
	 * <p>Only the prefix at the very start is removed; a scheme that appears
	 * later (for example inside a query value) is left intact.
	 *
	 * @param url the URL string, must not be {@code null}
	 * @return {@code url} without its leading scheme, or unchanged if it has none
	 */
	public static String removeProtocol(String url) {
		return PROTOCOL_PATTERN.matcher(url).replaceFirst("");
	}

	/**
	 * Returns the part of {@code url} between the scheme and the first
	 * {@code '/'} that follows it. A port, if present, is part of the result.
	 *
	 * @param url the URL string, must not be {@code null}
	 * @return the text after the scheme up to (excluding) the first slash found
	 *         at index 1 or later, or the whole scheme-less text if there is none
	 */
	public static String getDomain(String url) {
		String withoutProtocol = removeProtocol(url);
		// Search from index 1 so that a leading '/' is never taken as the cut point.
		int slash = withoutProtocol.indexOf('/', 1);
		return slash > 0 ? withoutProtocol.substring(0, slash) : withoutProtocol;
	}

	/**
	 * Removes the query string (parameters such as {@code page=1}) and the
	 * fragment from an absolute URL, keeping scheme, host, explicit port and path.
	 *
	 * @param url an absolute URL
	 * @return the URL without query and fragment, or {@code null} when
	 *         {@code url} cannot be parsed as an absolute URL
	 */
	public static String getUri(String url) {
		final URL absoluteUrl;
		try {
			absoluteUrl = new URL(url);
		} catch (MalformedURLException ignored) {
			// An unparsable input has no URI form; report that with null
			// instead of failing with a NullPointerException.
			return null;
		}
		StringBuilder uri = new StringBuilder();
		uri.append(absoluteUrl.getProtocol()).append("://").append(absoluteUrl.getHost());
		// getPort() is -1 when the URL carries no explicit port.
		if (absoluteUrl.getPort() != -1) {
			uri.append(':').append(absoluteUrl.getPort());
		}
		return uri.append(absoluteUrl.getPath()).toString();
	}

	/**
	 * Replaces every match of {@code regex} in {@code url} with {@code value}.
	 *
	 * @param url   the text to process
	 * @param regex the regular expression to search for
	 * @param value the replacement, using {@link java.util.regex.Matcher#replaceAll(String)} syntax
	 * @return the text with all matches replaced
	 */
	public static String replace(String url, String regex, String value) {
		return Pattern.compile(regex).matcher(url).replaceAll(value);
	}

	/**
	 * Tells whether {@code url} is a full HTTP(S) URL, i.e. starts with
	 * {@code http://} or {@code https://} (compared case-insensitively).
	 *
	 * @param url the string to test, may be {@code null}
	 * @return {@code true} for an http/https URL, {@code false} otherwise or for {@code null}
	 */
	public static boolean isUrl(String url) {
		return url != null
				&& (startsWithIgnoreCase(url, HTTP_PREFIX) || startsWithIgnoreCase(url, HTTPS_PREFIX));
	}

	/**
	 * Tells whether {@code url} starts with {@code '/'}.
	 *
	 * @param url the string to test, may be {@code null}
	 * @return {@code true} if it starts with a slash, {@code false} otherwise or for {@code null}
	 */
	public static boolean isAbUrl(String url) {
		return url != null && url.startsWith("/");
	}

	/**
	 * Tells whether {@code url} is neither a full HTTP(S) URL ({@link #isUrl})
	 * nor a string starting with {@code '/'} ({@link #isAbUrl}).
	 *
	 * @param url the string to test, may be {@code null}
	 * @return {@code true} if both other checks fail; {@code false} for {@code null}
	 */
	public static boolean isNabUrl(String url) {
		return url != null && !isUrl(url) && !isAbUrl(url);
	}

	/**
	 * Resolves {@code url} against the page it was found on and returns the
	 * absolute form with illegal characters encoded.
	 *
	 * @param url   the link to resolve; may be absolute, relative or query-only
	 * @param refer the URL of the referring page, used as the resolution base
	 * @return the absolute URL, or an empty string when {@code url} is
	 *         {@code null} or no valid URL can be built
	 */
	public static String canonicalizeUrl(String url, String refer) {
		if (url == null) {
			return "";
		}
		try {
			URL base;
			try {
				base = new URL(refer);
			} catch (MalformedURLException e) {
				// the base is unsuitable, but the link may be absolute on its own, so try that
				URL abs = new URL(url);
				return encodeIllegalCharacterInUrl(abs.toExternalForm());
			}
			// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
			if (url.startsWith("?")) {
				url = base.getPath() + url;
			}
			URL abs = new URL(base, url);
			return encodeIllegalCharacterInUrl(abs.toExternalForm());
		} catch (MalformedURLException e) {
			return "";
		}
	}

	/**
	 * Percent-encodes characters that are not allowed in a URL. Currently only
	 * the space character is handled (replaced with {@code %20}).
	 *
	 * @param url the URL string, must not be {@code null}
	 * @return the URL with every space replaced by {@code %20}
	 */
	public static String encodeIllegalCharacterInUrl(String url) {
		//TODO support more characters
		return url.replace(" ", "%20");
	}

	/** Returns the index of the {@code ordinal}-th occurrence of {@code ch} in {@code text}, or -1. */
	private static int ordinalIndexOf(String text, char ch, int ordinal) {
		int index = -1;
		for (int found = 0; found < ordinal; found++) {
			index = text.indexOf(ch, index + 1);
			if (index < 0) {
				return -1;
			}
		}
		return index;
	}

	/** Case-insensitive variant of {@link String#startsWith(String)}. */
	private static boolean startsWithIgnoreCase(String text, String prefix) {
		return text.regionMatches(true, 0, prefix, 0, prefix.length());
	}

}
